{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "UiNxsd4_q9wq"
   },
   "source": [
    "### What-If Tool in a jupyter notebook\n",
    "\n",
    "This notebook shows use of the [What-If Tool](https://pair-code.github.io/what-if-tool) inside of a jupyter notebook.\n",
    "\n",
    "This notebook trains a linear classifier on the [UCI census problem](https://archive.ics.uci.edu/ml/datasets/census+income) (predicting whether a person earns more than $50K from their census information).\n",
    "\n",
    "It then visualizes the results of the trained classifier on test data using the What-If Tool.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "jlwjF-Nnmoww"
   },
   "outputs": [],
   "source": [
    "#@title Define helper functions {display-mode: \"form\"}\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import functools\n",
    "\n",
    "# Creates a tf feature spec from the dataframe and columns specified.\n",
    "def create_feature_spec(df, columns=None):\n",
    "    feature_spec = {}\n",
    "    if columns == None:\n",
    "        columns = df.columns.values.tolist()\n",
    "    for f in columns:\n",
    "        if df[f].dtype is np.dtype(np.int64):\n",
    "            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.int64)\n",
    "        elif df[f].dtype is np.dtype(np.float64):\n",
    "            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.float32)\n",
    "        else:\n",
    "            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.string)\n",
    "    return feature_spec\n",
    "\n",
    "# Creates simple numeric and categorical feature columns from a feature spec and a\n",
    "# list of columns from that spec to use.\n",
    "#\n",
    "# NOTE: Models might perform better with some feature engineering such as bucketed\n",
    "# numeric columns and hash-bucket/embedding columns for categorical features.\n",
    "def create_feature_columns(columns, feature_spec):\n",
    "    ret = []\n",
    "    for col in columns:\n",
    "        if feature_spec[col].dtype is tf.int64 or feature_spec[col].dtype is tf.float32:\n",
    "            ret.append(tf.feature_column.numeric_column(col))\n",
    "        else:\n",
    "            ret.append(tf.feature_column.indicator_column(\n",
    "                tf.feature_column.categorical_column_with_vocabulary_list(col, list(df[col].unique()))))\n",
    "    return ret\n",
    "\n",
    "# An input function for providing input to a model from tf.Examples\n",
    "def tfexamples_input_fn(examples, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,\n",
    "                       num_epochs=None, \n",
    "                       batch_size=64):\n",
    "    def ex_generator():\n",
    "        for i in range(len(examples)):\n",
    "            yield examples[i].SerializeToString()\n",
    "    dataset = tf.data.Dataset.from_generator(\n",
    "      ex_generator, tf.dtypes.string, tf.TensorShape([]))\n",
    "    if mode == tf.estimator.ModeKeys.TRAIN:\n",
    "        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)\n",
    "    dataset = dataset.batch(batch_size)\n",
    "    dataset = dataset.map(lambda tf_example: parse_tf_example(tf_example, label, feature_spec))\n",
    "    dataset = dataset.repeat(num_epochs)\n",
    "    return dataset\n",
    "\n",
    "# Parses Tf.Example protos into features for the input function.\n",
    "def parse_tf_example(example_proto, label, feature_spec):\n",
    "    parsed_features = tf.parse_example(serialized=example_proto, features=feature_spec)\n",
    "    target = parsed_features.pop(label)\n",
    "    return parsed_features, target\n",
    "\n",
    "# Converts a dataframe into a list of tf.Example protos.\n",
    "def df_to_examples(df, columns=None):\n",
    "    examples = []\n",
    "    if columns == None:\n",
    "        columns = df.columns.values.tolist()\n",
    "    for index, row in df.iterrows():\n",
    "        example = tf.train.Example()\n",
    "        for col in columns:\n",
    "            if df[col].dtype is np.dtype(np.int64):\n",
    "                example.features.feature[col].int64_list.value.append(int(row[col]))\n",
    "            elif df[col].dtype is np.dtype(np.float64):\n",
    "                example.features.feature[col].float_list.value.append(row[col])\n",
    "            elif row[col] == row[col]:\n",
    "                example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))\n",
    "        examples.append(example)\n",
    "    return examples\n",
    "\n",
    "# Converts a dataframe column into a column of 0's and 1's based on the provided test.\n",
    "# Used to force label columns to be numeric for binary classification using a TF estimator.\n",
    "def make_label_column_numeric(df, label_column, test):\n",
    "  df[label_column] = np.where(test(df[label_column]), 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 2989
    },
    "colab_type": "code",
    "id": "nu398ARdeuxe",
    "outputId": "4ad2b2f4-259b-44f1-ec05-112a8b609260"
   },
   "outputs": [],
   "source": [
    "#@title Read training dataset from CSV {display-mode: \"form\"}\n",
    "\n",
    "import pandas as pd\n",
    "import hops.hdfs as hdfs\n",
    "\n",
    "# Set the path to the CSV containing the dataset to train on.\n",
    "# csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'\n",
    "\n",
    "# Set the column names for the columns in the CSV. If the CSV's first line is a header line containing\n",
    "# the column names, then set this to None.\n",
    "csv_columns = [\n",
    "  \"Age\", \"Workclass\", \"fnlwgt\", \"Education\", \"Education-Num\", \"Marital-Status\",\n",
    "  \"Occupation\", \"Relationship\", \"Race\", \"Sex\", \"Capital-Gain\", \"Capital-Loss\",\n",
    "  \"Hours-per-week\", \"Country\", \"Over-50K\"]\n",
    "\n",
    "# Read the dataset from the provided CSV and print out information about it.\n",
    "h = hdfs.get_fs()\n",
    "with h.open_file(hdfs.project_path() + \"/TourData/census/adult.data\", \"rt\") as trainFile:\n",
    "    df = pd.read_csv(trainFile, names=csv_columns, skipinitialspace=True)\n",
    "\n",
    "# df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "67DYIFxoevt2"
   },
   "outputs": [],
   "source": [
    "#@title Specify input columns and column to predict {display-mode: \"form\"}\n",
    "import numpy as np\n",
    "\n",
    "# Set the column in the dataset you wish for the model to predict\n",
    "label_column = 'Over-50K'\n",
    "\n",
    "# Make the label column numeric (0 and 1), for use in our model.\n",
    "# In this case, examples with a target value of '>50K' are considered to be in\n",
    "# the '1' (positive) class and all other examples are considered to be in the\n",
    "# '0' (negative) class.\n",
    "make_label_column_numeric(df, label_column, lambda val: val == '>50K')\n",
    "\n",
    "# Set list of all columns from the dataset we will use for model input.\n",
    "input_features = [\n",
    "  'Age', 'Workclass', 'Education', 'Marital-Status', 'Occupation',\n",
    "  'Relationship', 'Race', 'Sex', 'Capital-Gain', 'Capital-Loss',\n",
    "  'Hours-per-week', 'Country']\n",
    "\n",
    "# Create a list containing all input features and the label column\n",
    "features_and_labels = input_features + [label_column]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "BV4f_4_Lex22"
   },
   "outputs": [],
   "source": [
    "#@title Convert dataset to tf.Example protos {display-mode: \"form\"}\n",
    "\n",
    "examples = df_to_examples(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "YyLr-_0de1Ii"
   },
   "outputs": [],
   "source": [
    "#@title Create and train the classifier {display-mode: \"form\"}\n",
    "\n",
    "num_steps = 5000  #@param {type: \"number\"}\n",
    "\n",
    "# Create a feature spec for the classifier\n",
    "feature_spec = create_feature_spec(df, features_and_labels)\n",
    "\n",
    "# Define and train the classifier\n",
    "train_inpf = functools.partial(tfexamples_input_fn, examples, feature_spec, label_column)\n",
    "classifier = tf.estimator.LinearClassifier(\n",
    "    feature_columns=create_feature_columns(input_features, feature_spec))\n",
    "classifier.train(train_inpf, steps=num_steps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "NUQVro76e38Q"
   },
   "outputs": [],
   "source": [
    "#@title Invoke What-If Tool for test data and the trained model {display-mode: \"form\"}\n",
    "\n",
    "num_datapoints = 2000  #@param {type: \"number\"}\n",
    "tool_height_in_px = 1000  #@param {type: \"number\"}\n",
    "\n",
    "from witwidget.notebook.visualization import WitConfigBuilder\n",
    "from witwidget.notebook.visualization import WitWidget\n",
    "\n",
    "# Load up the test dataset\n",
    "#test_csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'\n",
    "#test_df = pd.read_csv(test_csv_path, names=csv_columns, skipinitialspace=True,\n",
    "#  skiprows=1)\n",
    "\n",
    "h = hdfs.get_fs()\n",
    "with h.open_file(hdfs.project_path() + \"/TourData/census/adult.test\", \"rt\") as testFile:\n",
    "    test_df = pd.read_csv(testFile, names=csv_columns, skipinitialspace=True, skiprows=1)\n",
    "\n",
    "make_label_column_numeric(test_df, label_column, lambda val: val == '>50K.')\n",
    "test_examples = df_to_examples(test_df[0:num_datapoints])\n",
    "\n",
    "# Setup the tool with the test examples and the trained classifier\n",
    "config_builder = WitConfigBuilder(test_examples).set_estimator_and_feature_spec(\n",
    "    classifier, feature_spec).set_label_vocab(['Under 50K', 'Over 50K'])\n",
    "WitWidget(config_builder, height=tool_height_in_px)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "What-If Tool Notebook Usage",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "python-demo_deep_learning_admin000__meb10000",
   "language": "python",
   "name": "python"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}